In this notebook, we will train a Latent Dirichlet Allocation (LDA) model on tweets to learn sets of words that commonly appear together, each set hopefully corresponding to a topic. We will train the LDA model on the whole corpus of tweets and extract 10 topics. Additionally, we will visualize the results using the pyLDAvis library.
Next, we will take these results to a separate notebook for analysis. There, we will assign a topic distribution to each tweet based on the words it contains, and sum the topic distributions of all tweets belonging to a state to obtain the topic distribution per state.
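As a rough preview of that follow-up step, here is a minimal sketch of how the per-state topic distributions could be computed with gensim and pandas. It assumes the `ldamodel`, `dictionary`, and `tweets` DataFrame (with `token` and `state` columns) that are built later in this notebook; the helper name `state_topic_distributions` is hypothetical.
In [ ]:
import numpy as np
import pandas as pd

def state_topic_distributions(tweets, ldamodel, dictionary, nr_topics=10):
    '''Hypothetical helper: sum per-tweet topic distributions by state.'''
    def tweet_topics(tokens):
        # infer the topic mixture of one tweet from its bag-of-words
        bow = dictionary.doc2bow(tokens)
        dense = np.zeros(nr_topics)
        for topic_id, weight in ldamodel.get_document_topics(bow, minimum_probability=0.0):
            dense[topic_id] = weight
        return dense
    per_tweet = tweets['token'].apply(tweet_topics)
    # stack into an (n_tweets x nr_topics) matrix and sum the rows per state
    topic_matrix = pd.DataFrame(per_tweet.tolist(), index=tweets.index)
    per_state = topic_matrix.groupby(tweets['state']).sum()
    # normalize each state's row so it is a distribution over topics
    return per_state.div(per_state.sum(axis=1), axis=0)
The normalization at the end is optional; the raw sums could be kept instead if the absolute tweet volume per state matters for the analysis.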
In [2]:
from pymongo import MongoClient
import json
client = MongoClient()
db = client.Twitter
In [3]:
import pandas as pd
import time
import re
from nltk.tokenize import RegexpTokenizer
import HTMLParser # In Python 3.4+ import html
import nltk
from nltk.corpus import stopwords
In [4]:
start_time = time.time()
# keep only English-language tweets geotagged inside the US
filter_query = {
    "$and": [{"place.country_code": "US"}, {"lang": "en"}]
}
# keep only our fields of interest
columns_query = {
    'text': 1,
    'entities.hashtags': 1,
    'entities.user_mentions': 1,
    'place.full_name': 1,
    'place.bounding_box': 1
}
tweets = pd.DataFrame(list(db.tweets.find(
    filter_query,
    columns_query
)  # .limit()
))
elapsed_time = time.time() - start_time
print elapsed_time
In [5]:
# extract the two-letter state abbreviation from the end of place.full_name
tweets['state'] = map(lambda place_dict: place_dict['full_name'][-2:], tweets['place'])
In [6]:
tweets['state'].value_counts().head()
Out[6]:
In [7]:
# #for one state only
# state = 'CA'
# tweets = tweets[tweets['state']==state]
len(tweets)
Out[7]:
In [8]:
def Clean(unescaped_tweet):
    '''Take a tweet as input and return a list of lowercase word tokens.'''
    tokenizer = RegexpTokenizer(r'\w+')
    cleaned_tweet_tokens = tokenizer.tokenize(unescaped_tweet.lower())
    return cleaned_tweet_tokens
start_time = time.time() # start timing
# remove URLs from the tweet text
tweets['text'] = tweets['text'].apply(lambda tweet: re.sub(r"http\S+", "", tweet))
#########################################################
def trump_mention(tweet):
    '''Return True if the tweet mentions Trump by first or last name.'''
    lowered = tweet.lower()
    return ('trump' in lowered) or ('donald' in lowered)
tweets['Trump'] = tweets['text'].apply(trump_mention)
##############################################################
# tweet mentions start with @, hashtags with #
# create two columns holding the mentions and the hashtags
tweets['mentions'] = tweets['text'].apply(lambda tweet: re.findall(r'\@\w+',tweet))
tweets['hashtags'] = tweets['text'].apply(lambda tweet: re.findall(r'\#\w+',tweet))
#remove hashtags and mentions
tweets['text'] = tweets['text'].apply(lambda tweet: re.sub(r"\@\w+" , "", tweet))
tweets['text'] = tweets['text'].apply(lambda tweet: re.sub(r"\#\w+" , "", tweet))
# remove digits from the text
tweets['text'] = tweets['text'].apply(lambda tweet: ''.join([i for i in tweet if not i.isdigit()]))
# remove the candidates' first and last names, plus "USA" and the HTML-escape leftover "amp"
tweets['text'] =tweets['text'].apply(lambda tweet: re.sub(r"Trump" , "", tweet))
tweets['text'] =tweets['text'].apply(lambda tweet: re.sub(r"Clinton" , "", tweet))
tweets['text'] =tweets['text'].apply(lambda tweet: re.sub(r"Donald" , "", tweet))
tweets['text'] =tweets['text'].apply(lambda tweet: re.sub(r"Hillary" , "", tweet))
tweets['text'] =tweets['text'].apply(lambda tweet: re.sub(r"USA" , "", tweet))
tweets['text'] =tweets['text'].apply(lambda tweet: re.sub(r"amp" , "", tweet))
# tokenize the text into a new column, then drop duplicate tokens and English stopwords
tweets['token'] = tweets['text'].apply(lambda tweet: Clean(tweet))
tweets['token'] = tweets['token'].apply(lambda x: list(set(x) - set(stopwords.words('english'))))
elapsed_time = time.time() - start_time #time ends
print elapsed_time
tweets.head()
Out[8]:
In [9]:
tweets.head()
Out[9]:
In [10]:
#test['tags'] = map(lambda tweet: map(lambda tweet: tweet['text'] , tweet['entities']['hashtags']) if tweet['entities']['hashtags'] != None else None, raw_tweet[:100])
#tweets['text'][9]
In [11]:
doc_complete = tweets['token'].tolist()
doc_complete[:2]
Out[11]:
In [12]:
import gensim
In [13]:
import pickle
In [14]:
import gensim
from gensim import corpora
# Create the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_complete)
pickle.dump(dictionary, open( 'dictionary2.pickle', "wb" ) )
# Convert the list of documents (corpus) into a document-term matrix (bag-of-words) using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_complete]
pickle.dump(doc_term_matrix, open( 'doc_term_matrix.pickle', "wb" ) )
In [15]:
Lda = gensim.models.ldamulticore.LdaMulticore
In [ ]:
nr_topics = 10
nr_passes = 100
start_time = time.time()
# Create the LDA model object using the gensim library
# and train it on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=nr_topics, id2word = dictionary, passes=nr_passes)
elapsed_time = time.time() - start_time
In [18]:
print 'Topic modelling for', nr_topics,'topics,', nr_passes,'passes,',len(tweets),'tweets:','\ncomplete in',elapsed_time/60.,'minutes'
In [ ]:
# Runtimes:
# Florida (~4K tweets): ~16 min for 10 topics, 300 passes
# CA (~57K tweets): ~48 min for 10 topics, 300 passes
In [ ]:
# Can we do it on the whole dataset? -> take the topics and classify each tweet within them.
# Then we have discrete topics, each with its own word weights,
# so each tweet is represented by its topic weights (see the sketch below).
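A minimal sketch of that representation, assuming the `ldamodel`, `dictionary`, and `doc_complete` defined above: gensim's `get_document_topics` maps any bag-of-words to a list of (topic id, weight) pairs.
In [ ]:
# represent a single tweet as a topic-weight vector
example_tokens = doc_complete[0]
bow = dictionary.doc2bow(example_tokens)
print ldamodel.get_document_topics(bow, minimum_probability=0.0)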
In [ ]:
# Print all topics, describing each with its top 50 words.
topics = ldamodel.print_topics(num_topics=nr_topics, num_words=50)
for topic in topics:
    print topic
    print ""
In [5]:
import pickle
In [7]:
nr_topics = 10
nr_passes = 100
In [8]:
state = 'allstates'
name = "trained models/lda/%s_%itopics_%ipasses.pickle" % (state, nr_topics, nr_passes)
print "Proceed to save model in:", name
In [ ]:
pickle.dump(ldamodel, open( name, "wb" ) )
In [9]:
#load
ldamodel = pickle.load(open(name,'rb'))
In [13]:
import time
In [10]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
In [14]:
#load the LDA results (model, dictionary and corpus)
start_time = time.time()
ldamodel = pickle.load(open('trained models/lda/allstates_10topics_100passes.pickle'))
dictandcorpus = pickle.load(open('trained models/lda/Dictionary.pickle'))
c = dictandcorpus[1]  # the corpus (document-term matrix)
d = dictandcorpus[0]  # the gensim dictionary
del dictandcorpus
elapsed_time = time.time() - start_time
print elapsed_time
In [18]:
data = pyLDAvis.gensim.prepare(ldamodel, c, d)
In [19]:
data
Out[19]:
In [ ]:
#save results as an html file
pyLDAvis.save_html(data, open('LDA topics.html','wb'))